library(AppliedPredictiveModeling)
library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.2     v dplyr   0.7.5
v tidyr   0.8.1     v stringr 1.3.1
v readr   1.1.1     v forcats 0.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(caret)
Loading required package: lattice
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
# Load the cell-segmentation data set and convert it to a tibble so that
# printing shows a compact preview.
data(segmentationOriginal)
segmentationOriginal <- as_tibble(segmentationOriginal)
segmentationOriginal

# Keep only the rows whose Case column is "Train".
seg_data <- subset(segmentationOriginal, Case == "Train")
seg_data

# Save the three identifier columns before removing them from the predictors.
# cell_id must come from the Cell column; reading Case here would simply
# duplicate `case`.
cell_id <- seg_data$Cell
class <- seg_data$Class
case <- seg_data$Case

# Drop the identifier columns by name (rather than by position, which would
# silently remove the wrong columns if the layout changed) together with every
# column whose name contains "Status".
seg_data <- seg_data %>%
  select(-Cell, -Case, -Class, -contains("Status"))
seg_data
Skewness
library(e1071)
# Skewness of a single predictor column.
skewness(seg_data[["AngleCh1"]])
[1] -0.02426252
# Skewness of every column at once; purrr alternative:
#   seg_data %>% map_dfr(skewness)
seg_data %>% summarize_all(skewness)
Box-Cox transform
# Estimate a Box-Cox transformation for the AreaCh1 column, then print the
# fitted transformation object.
Ch1AreaTrans <- BoxCoxTrans(seg_data[["AreaCh1"]])
Ch1AreaTrans
Box-Cox Transformation
1009 data points used to estimate Lambda
Input data summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
150.0 194.0 256.0 325.1 376.0 2186.0
Largest/Smallest: 14.6
Sample Skewness: 3.53
Estimated Lambda: -0.9
Apply the transform with the predict function
# Apply the fitted Box-Cox transformation to the first few AreaCh1 values.
dat <- predict(Ch1AreaTrans, head(seg_data$AreaCh1))
dat
[1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523
Or perform it all at once via caret::preProcess
# Show the first three entries of percent_variance.
# NOTE(review): percent_variance is not defined anywhere in the visible code,
# so this line fails with "object not found" if the script is run as shown.
# Presumably it holds per-component percentages of variance from a PCA fit
# whose chunk is missing here - TODO confirm and restore that chunk.
percent_variance[1:3]
[1] 20.91236 17.01330 11.88689
Near zero variance
# Column indices of predictors flagged as having near-zero variance, if any.
seg_data %>% nearZeroVar()
integer(0)
Correlations
# Pairwise correlations between all remaining columns, followed by the size of
# the resulting matrix.
correlations <- seg_data %>% cor()
dim(correlations)
[1] 58 58
# Peek at the top-left 4 x 4 corner of the correlation matrix.
correlations[seq_len(4), seq_len(4)]
AngleCh1 AreaCh1 AvgIntenCh1 AvgIntenCh2
AngleCh1 1.000000000 -0.002627172 -0.04300776 -0.01944681
AreaCh1 -0.002627172 1.000000000 -0.02529739 -0.15330301
AvgIntenCh1 -0.043007757 -0.025297394 1.00000000 0.52521711
AvgIntenCh2 -0.019446810 -0.153303007 0.52521711 1.00000000